### This R script is to fill out Table 1 for the teacher working conditions paper

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")

# open necessary packages
require(dplyr)

## Import data
data_17 <- read.csv("core_17.csv")
data_18 <- read.csv("core_18.csv")
data_19 <- read.csv("core_19.csv")
data_21 <- read.csv("core_21.csv")
data_22 <- read.csv("core_22.csv")
data_23 <- read.csv("core_23.csv")

data <- list(data_17,data_18,data_19,data_21,data_22,data_23) 
names(data) <- paste("data_",c(17:19,21:23),sep="")

# drop individual data objects
rm(data_17)
rm(data_18)
rm(data_19)
rm(data_21)
rm(data_22)
rm(data_23)

## Make Table structure: seven columns, nrow=length(data). Reverse the order
## of the rows later.
Table_1 <- matrix(NA, length(data), 7); Table_1 <- as.data.frame(Table_1)
rownames(Table_1) <- names(data) 
colnames(Table_1) <- c("survey_schools", "fraction_of_total_schools","enrollment_survey_schools",
                       "fraction_of_total_students", "n_teachers_survey_schools", 
                       "teacher_response_rate","n_teachers_by_rate")

## Column 1 - number of 5E schools
## For column 1, we are only counting schools that have enough teacher respondents 
## to publish the teacher variables. 
## Code this using an “if” statement that doesn’t count the school if there is a full row of NAs 
## in the teacher variables. Drop schools that don't meet the criteria since we only want schools
## with teacher variables for Table 1. 

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
data$data_18 <- data$data_18 %>% rename(cdis1=cdis)
data$data_19 <- data$data_19 %>% rename(cdis1=cdis)

## Import list of teacher variables
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.name
varnames <- varnames[!is.na(varnames)] 

# write function to drop rows without teacher variables (full row of NAs in the teacher vars)
drop_rows_without_teacher_vars <- function(data){
  dummy <- data %>% select(any_of(varnames)) %>% apply(., 1, function(x) sum(!is.na(x))) > 0 
  data <- filter(data, dummy)
  return(data)
}

test <- drop_rows_without_teacher_vars(data$data_18)
nrow(data$data_18)-nrow(test)
summary(test) # should have teacher_n and teacher_response_rate be large, and have fewer NAs in teacher vars than before

# apply the function to the data list
data_backup <- data
data <- lapply(data, drop_rows_without_teacher_vars)

# check work
n_dropped <- sapply(data_backup, nrow)-sapply(data, nrow) # How many schools were dropped?

sapply(data_backup, function(x) summary(x$teacher_n))
sapply(data, function(x) summary(x$teacher_n))

# Fill in column 1
Table_1$survey_schools <- sapply(data,nrow)

## Column 2 - 5E Schools as a fraction of all schools listed in the CCD.
n_schools_CCD <- read.csv("n_schools_CCD.csv")

Table_1$fraction_of_total_schools <- sapply(data,nrow)/n_schools_CCD$N_schools

## Column 3 - number of students in 5E schools
## Use the enrollment data from CCD to count the number of students ("enrollment" variable) since the 
## student_n variable from the survey doesn’t include all students. 
Table_1$enrollment_survey_schools <- sapply(data, function(x) sum(x$enrollment))
                   
## Column 4 - number of students 5E schools as fraction of total enrollment in CCD. 
## Use total statewide public enrollment from digest of education statistics for this.
digest <- read.csv("digest of ed stats numbers Table_1.csv")
digest$statewide_public_enrollment <- as.numeric(gsub(",","",digest$statewide_public_enrollment))
Table_1$fraction_of_total_students <- Table_1$enrollment_survey_schools/digest$statewide_public_enrollment

## Column 5 - number of teachers in 5E schools
Table_1$n_teachers_survey_schools <- sapply(data, function(x) sum(x$teacher_n))

## Column 6 - teacher survey response rate (Conditional on school sampling)
## Weighted average of teacher response rate, with teacher_n as weight
Table_1$teacher_response_rate <-sapply(data, function(x) weighted.mean(x$teacher_response_rate, x$teacher_n))

## Column 7: Estimated number of teacher respondents (number*rate)
## Product of (col 5)*(col 6)
Table_1$n_teachers_by_rate <- Table_1$n_teachers_survey_schools*(Table_1$teacher_response_rate/100)

## Reverse the order of the rows
Table_1 <- Table_1[nrow(Table_1):1,]

## Round values
Table_1$n_teachers_by_rate <- round(Table_1$n_teachers_by_rate)
Table_1 <- round(Table_1, 2)

## Export Table
write.csv(Table_1, "Table_1.csv")
